%reset
import gzip
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy import stats
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# For exporting plotly graphs to HTML
import plotly.io as pio
pio.renderers.default='notebook'
# Load in the Video Game review data from http://jmcauley.ucsd.edu/data/amazon/
with gzip.open('reviews_Video_Games_5.json.gz', 'rb') as f:
    df = pd.DataFrame([json.loads(line) for line in f])
# Remove all non-ascii characters and store in new column called 'ascii'
df['ascii'] = df['reviewText'].str.encode('ascii', 'ignore').str.decode('ascii')
# View any non-ascii characters by matching original and new column
print(df['reviewText'][df['ascii']!=df['reviewText']])
print('There are no reviews containing non-ascii characters!')
# Drop new column since no non-ascii characters
df = df.drop(columns=['ascii'])
# Label sentiments (Rating 1, 2 --> 0, Rating 3, 4, 5 --> 1)
df['label'] = df['overall'].apply(lambda x: 0 if x in [1, 2] else 1)
Series([], Name: reviewText, dtype: object)
There are no reviews containing non-ascii characters!
# Show histogram to represent the imbalanced class problem
fig=px.histogram(df['label'])
fig.update_layout(title='Imbalanced Class Problem', xaxis_title='Sentiment Type')
fig.show()
print('There is a bias towards positive sentiment in the data.')
print('The data must be evenly sampled (by removing excess positive-sentiment reviews) so that the neural net does not train on biased data.')
# See reference (3)
# Make the amount of samples the same for positive and negative sentiment
df_no = df[df['label']==0]
df_yes = df[df['label']==1].iloc[:len(df_no)]
df_ml = pd.concat([df_no, df_yes])
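# Quick sanity check (an illustrative follow-up, not part of the original analysis): confirm that both
# sentiment classes now contain the same number of reviews after undersampling the positive class.
print(df_ml['label'].value_counts())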
# Show the updated histogram with the properly balanced sentiment for training
fig=px.histogram(df_ml['label'])
fig.update_layout(title='Corrected Balanced Samples', xaxis_title='Sentiment Type')
fig.show()
There is a bias towards positive sentiment in the data.
The data must be evenly sampled (by removing excess positive-sentiment reviews) so that the neural net does not train on biased data.
# Specify inputs for the neural net
# Vocabulary size is the number of unique words kept from the corpus, prioritizing the most frequent words
vocab_size=10000
# The embedding dimension is the length of the dense vector used to represent each word
# The typical length of the embedding vector is 100-200, where higher dimensions yield diminishing returns
embedding_dim=128
# The maximum sequence length is the longest sentence length the data will take before it cuts off the remainder
# A larger sequence length increases computational cost and may make the model worse
max_length=130 # The maximum number of words in any given sequence (cuts off after max_length words)
# Tokenization Process: TensorFlow builds a vocabulary from every word present in the training data and
# assigns each word a unique integer index, ordered by frequency (lower indices are more common words).
# Each review string is then converted into a sequence of these integer indices.
# The goal of the tokenization process is to convert sentence strings into numerical representations so
# the data can be fed into a neural network (which cannot take in strings). A small toy illustration is
# included after the tokenizer is fit below.
# The data will be split into 70% training data and 30% testing data. This means that 70% of the data will
# be used to train the model, and 30% of the data will be used for validation accuracy. A random state is
# set in case the data is sequentially biased. Splitting the ~57k reviews 70:30 helps ensure that the
# model is not overfitted. This is done with sklearn's train_test_split method.
# The final dense layer of the network has a single output neuron with a sigmoid activation function.
# 1/2-star reviews are converted to a negative sentiment of '0' and 3/4/5-star reviews are converted to
# a positive sentiment of '1'.
# Create training data
X = df_ml['reviewText']
y = df_ml['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Create the tokenizer from the training data
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
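# Toy illustration (a hypothetical mini-corpus, not the review data) of how a fitted tokenizer maps words
# to integer indices; words that were never seen during fitting map to the <OOV> token's index.
toy_tokenizer = Tokenizer(num_words=50, oov_token="<OOV>")
toy_tokenizer.fit_on_texts(['great game great fun', 'bad game'])
print(toy_tokenizer.word_index)                                    # e.g. {'<OOV>': 1, 'great': 2, 'game': 3, ...}
print(toy_tokenizer.texts_to_sequences(['great unknown game']))    # 'unknown' becomes the <OOV> index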
# The padding process is used to standardize the length of every sequence in a data set. Given a maximum
# sequence length, each sentence is either truncated or padded depending on its length. If the sentence is
# too long, it is truncated down to the maximum sequence length by cutting off every word after the
# cut-off point. If the sentence is shorter than the maximum length, it is padded with '0' up to the
# maximum sequence length, either before (pre) or after (post) the sentence. Padding and truncation both
# occur after the sequence (post) in this notebook; a small toy illustration of pre vs. post padding
# follows the example output below.
# Create training data for neural net model, and pad the sequences
training_sequences = tokenizer.texts_to_sequences(X_train)
training_padded = pad_sequences(training_sequences, maxlen=max_length, truncating='post', padding='post')
print('The following is an example of a padded sequence:')
print(training_padded[0])
# Create testing data for neural net model, and pad the sequences
testing_sequences = tokenizer.texts_to_sequences(X_test)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating='post', padding='post')
The following is an example of a padded sequence:
[ 21 205 21 849 15 2 249 12 31 9 47 26 48 417
6 1139 55 25 2 63 16 22 708 21 51 21 2073 81
796 2 314 276 9 782 2 321 212 11 2 419 9 434
2 114 2503 296 4388 880 6 51 4627 5573 862 3 2720 2927
3 2 187 756 9 354 21 79 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0]
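# Toy illustration (hypothetical short sequences, not the review data) of 'pre' vs. 'post' padding and
# truncation with pad_sequences, as referenced in the padding explanation above.
toy_seqs = [[5, 8, 2], [7, 1, 9, 4, 3, 6]]
print(pad_sequences(toy_seqs, maxlen=5, padding='post', truncating='post'))  # zeros appended; long sequence cut after 5 words
print(pad_sequences(toy_seqs, maxlen=5, padding='pre', truncating='pre'))    # zeros prepended; first word of the long sequence dropped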
# # Save the prepared dataset by appending sheets to an existing workbook
# path='C:\\Users\\qmeye\\OneDrive\\Documents\\WGU\\D213\\Task-2\\cleaned_data.xlsx'
# with pd.ExcelWriter(path, engine='openpyxl', mode='a') as writer:
#     y_test.to_excel(writer, sheet_name='y_test')
#     y_train.to_excel(writer, sheet_name='y_train')
#     pd.DataFrame(training_padded).to_excel(writer, sheet_name='training_padded')
#     pd.DataFrame(testing_padded).to_excel(writer, sheet_name='testing_padded')
# Statistical justification for maximum sequence length
# Find the total number of words (tokens) in each training sequence
df_maxlen = pd.DataFrame([len(seq) for seq in training_sequences])
# Plot the histogram for the total number of words in each review
print(np.percentile(df_maxlen, 75))
fig=px.histogram(df_maxlen)
fig.add_vline(x=np.median(df_maxlen), line_dash='dash', line_color='red',
annotation_text='The median count is ' + str(np.median(df_maxlen)),
annotation_font_color='red',
annotation_font_size=20)
fig.add_vline(x=np.percentile(df_maxlen, 75), line_dash='dash', line_color='orange',
annotation_text='The 75th percentile is ' + str(np.percentile(df_maxlen, 75)),
annotation_font_color='orange',
annotation_font_size=20,
annotation_position='bottom right')
fig.update_layout(title='Histogram of Number of Words in each Review',
xaxis_title='Bins for Number of Words')
fig.update(layout_showlegend=False)
fig.show()
print('The maximum sequence length is set to 130 words, a reasonable cut-off given the distribution of review lengths shown above (the 75th percentile is 261 words).')
print('Using a higher maximum word count might encourage overfitting and would increase the amount of padding required for shorter reviews.')
261.0
The maximum sequence length is set to 130 words, a reasonable cut-off given the distribution of review lengths shown above (the 75th percentile is 261 words).
Using a higher maximum word count might encourage overfitting and would increase the amount of padding required for shorter reviews.
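# Coverage check (an illustrative follow-up, not part of the original analysis): fraction of training
# reviews that fit entirely within the chosen max_length without being truncated.
coverage = float((df_maxlen.iloc[:, 0] <= max_length).mean())
print(f'{coverage:.1%} of training reviews contain {max_length} words or fewer.')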
tf.keras.backend.clear_session() # Clear backend session to iterate model
# * A Sequential model means the layers are stacked linearly: each layer has exactly one input tensor and
#   one output tensor, and data flows straight through the stack. tf.keras.Sequential is used here.
# * Dense layers are fully connected layers of neurons in which each neuron receives input from every
#   neuron of the previous layer. This is important when building an interconnected model.
# The model will use five layers in total (4)
# 1. The Embedding layer maps each of the vocab_size token indices to a dense vector of length
#    embedding_dim, which allows for quicker computations and less overfitting. The output for each
#    review is a [max_length x embedding_dim] size matrix.
# 2. The GlobalAveragePooling1D layer averages the [max_length x embedding_dim] size matrix over the
#    sequence dimension down to a 1-dimensional [1 x embedding_dim] size matrix. This step is important
#    to prevent overfitting.
# 3. The Dense ReLU layer is a hidden layer that outputs a [1x16] size layer, which helps prevent
#    overfitting and decreases the complexity of the model. An l2 penalty helps prevent overfitting the
#    data further, since there were issues of increasing accuracy but decreasing validation accuracy.
# 4. A Dropout layer is included to prevent overfitting and to increase validation accuracy by randomly
#    dropping the fraction of units specified by the rate (e.g. 0.2 = 20% of units dropped each update).
# 5. The Dense Sigmoid layer is the output layer and produces a [1x1] size matrix. Since this is a binary
#    classification problem, the sigmoid function is important in classifying the final output as either
#    '0', which is negative sentiment, or '1', which is positive sentiment.
model = tf.keras.Sequential([
tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
tf.keras.layers.GlobalAveragePooling1D(),
tf.keras.layers.Dense(16, activation='relu', kernel_regularizer=regularizers.l2(1e-4)),
tf.keras.layers.Dropout(0.2),
tf.keras.layers.Dense(1, activation='sigmoid')
])
# A loss function of 'binary_crossentropy' is used. Since a binary classification problem is being
# performed, binary_crossentropy compares the sigmoid output (as in logistic regression) against the true
# label and assigns a higher loss to predictions that are confident but wrong. For example, if a user gave
# a 1-star review but used positive sentiment in the text (i.e. 'This product is GREAT!': 1 star), the
# model will tend to output a high probability of positive sentiment, and the mismatch with the negative
# label produces a large loss (5). A small numeric illustration follows the compile call below.
# The 'adam' optimizer is used. Adam is a gradient-descent optimizer that combines momentum with
# per-parameter adaptive learning rates, and it is a good optimizer to use due to its speed and stability.
# Accuracy is used as the reported metric. Accuracy is the fraction of predictions that match the actual
# labels. There are separate accuracies and losses for both the training and testing data sets.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
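# Worked illustration (hand-picked probabilities, not model outputs) of how binary cross-entropy
# penalizes a confident but wrong prediction far more than a confident correct one.
bce = tf.keras.losses.BinaryCrossentropy()
print(bce(np.array([0.0]), np.array([0.95])).numpy())  # true label 0, predicted 0.95 -> large loss (~3.0)
print(bce(np.array([1.0]), np.array([0.95])).numpy())  # true label 1, predicted 0.95 -> small loss (~0.05)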
# Provide the output of the model summary of the function from TensorFlow
model.summary()
# Number of epochs specified are the total number of cycles for training the neural net
num_epochs=10
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding (Embedding) (None, 130, 128) 1280000
global_average_pooling1d (GlobalAveragePooling1D) (None, 128) 0
dense (Dense) (None, 16) 2064
dropout (Dropout) (None, 16) 0
dense_1 (Dense) (None, 1) 17
=================================================================
Total params: 1,282,081
Trainable params: 1,282,081
Non-trainable params: 0
_________________________________________________________________
# Establish stopping criteria. The neural net will train for the total number of epochs unless a stopping
# criterion is triggered. In this case, validation loss 'val_loss' is monitored: if it fails to improve
# (decrease) for the number of epochs specified by patience, the model stops training early regardless of
# the specified number of epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
# Train the model with .fit()
history=model.fit(training_padded, y_train, epochs=num_epochs,
validation_data=(testing_padded, y_test), callbacks=es)
Epoch 1/10
1248/1248 [==============================] - 20s 16ms/step - loss: 0.4182 - accuracy: 0.8270 - val_loss: 0.3286 - val_accuracy: 0.8719
Epoch 2/10
1248/1248 [==============================] - 19s 15ms/step - loss: 0.2933 - accuracy: 0.8905 - val_loss: 0.3169 - val_accuracy: 0.8718
Epoch 3/10
1248/1248 [==============================] - 19s 15ms/step - loss: 0.2591 - accuracy: 0.9059 - val_loss: 0.3142 - val_accuracy: 0.8765
Epoch 4/10
1248/1248 [==============================] - 19s 15ms/step - loss: 0.2361 - accuracy: 0.9152 - val_loss: 0.3159 - val_accuracy: 0.8722
Epoch 4: early stopping
# Visualizations
# Get accuracy and loss information from the model's history and plot them vs epochs
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs=np.arange(0,len(acc))
fig=make_subplots(rows=2,cols=1,
subplot_titles=('Training and Validation Accuracies', 'Training and Validation Losses'))
# go.Scatter is used for the line traces (go.Line is deprecated in plotly)
fig.add_trace(go.Scatter(
x=epochs,
y=val_acc, name='Validation Accuracy'),
row=1, col=1)
fig.add_trace(go.Scatter(
x=epochs,
y=acc, name='Training Accuracy'),
row=1, col=1)
fig.add_trace(go.Scatter(
x=epochs,
y=val_loss, name='Validation Loss'),
row=2, col=1)
fig.add_trace(go.Scatter(
x=epochs,
y=loss, name='Training Loss'),
row=2, col=1)
fig.update_xaxes(title_text='epochs', row=2, col=1)
fig.update_yaxes(title_text='Accuracy', row=1, col=1)
fig.update_yaxes(title_text='Loss', row=2, col=1)
# The model's training accuracy increases, but the validation accuracy plateaus and slightly decreases
# with the number of epochs. This suggests that the model trains very quickly but also quickly overfits.
# To reduce this overfitting, an L2 regularization penalty was applied to the hidden dense layer, and a
# dropout layer was introduced, which randomly zeroes the specified fraction of units during each
# training update.
# The ending predictive accuracy of the network on the training data is around 92%, while the ending
# validation accuracy is around 87%. The training accuracy improves from about 83%, but the validation
# accuracy stays roughly flat near 87%. The validation accuracy (~87%) is therefore the better estimate
# of the network's predictive accuracy on unseen reviews.
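# Optional check (an illustrative follow-up, not part of the original analysis): re-evaluate the trained
# model on the held-out test set to report the final loss and accuracy directly.
test_loss, test_acc = model.evaluate(testing_padded, y_test, verbose=0)
print(f'Test loss: {test_loss:.4f}, test accuracy: {test_acc:.4f}')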
model.save('sentimentModel.h5') # Save the final neural net
# Now that the model is saved, it can be loaded in and used on additional data as needed.
# For example, if a supplier wanted to assess the sentiment of reviews on their Amazon page, they could
# prepare the data as in this Jupyter notebook, load in the saved model, and run it on the prepared data
# to derive sentiments. A sketch of loading the saved model follows (note that the fitted tokenizer must
# also be available to encode new text); a full prediction example appears further below.
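# Minimal sketch (assumptions: the fitted tokenizer from this notebook is still in memory or has been
# persisted separately, e.g. with pickle; the .h5 file stores only the network architecture and weights;
# the review text below is a hypothetical example).
loaded_model = keras.models.load_model('sentimentModel.h5')
new_reviews = ['Fantastic game, I would definitely buy it again']
new_sequences = tokenizer.texts_to_sequences(new_reviews)
new_padded = pad_sequences(new_sequences, maxlen=max_length, padding='post', truncating='post')
print(loaded_model.predict(new_padded))  # values near 1 indicate positive sentiment, near 0 negative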
# The model is able to be trained quickly, but just as quickly begins to overfit. This suggests that the
# simple model implemented is effective and that the data is not terribly complex, given the adequate
# amount of training data. The resulting model is robust, with roughly 87% validation accuracy.
# For next steps, it is recommended to implement this model in the pipeline for 'Video Game' reviews.
# Additional modeling and vocabulary selection might be needed before implementing this model for other
# Amazon categories. Data analysts can use this model to predict positive or negative sentiment and then
# apply other text-processing techniques to determine whether customers are leaving accurate and fair
# reviews for any given product. This is a potential issue due to bots leaving negative reviews, or
# customers not understanding the review process. It is imperative that Amazon's suppliers are treated
# fairly under the scrutiny of reviews.
# Example of how to implement the model after creation
sentence=['This product sucks. I hate this company',
'This game did not have the best graphics so I would really not recommend it',
'This product is perfect. I really love this game.',
'I wish the game would\'ve been better, but overall it was an amazing game',
'I have a neutral opinion on this game']
sequences = tokenizer.texts_to_sequences(sentence)
padded=pad_sequences(sequences, maxlen=max_length,
padding='post',
truncating='post')
print(model.predict(padded))
1/1 [==============================] - 0s 48ms/step
[[0.0306033 ]
 [0.38701177]
 [0.7962683 ]
 [0.634207  ]
 [0.47962844]]
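# Interpretation sketch (an illustrative follow-up): convert the raw sigmoid probabilities above into
# sentiment labels by thresholding at 0.5.
probs = model.predict(padded)
for text, p in zip(sentence, probs.flatten()):
    label = 'positive' if p >= 0.5 else 'negative'
    print(f'{p:.3f} -> {label}: {text}')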
(1) https://keras.io/guides/sequential_model/
(2) https://towardsdatascience.com/how-to-choose-the-right-activation-function-for-neural-networks-3941ff0e6f9c
(3) https://www.analyticsvidhya.com/blog/2020/07/10-techniques-to-deal-with-class-imbalance-in-machine-learning/
(4) https://www.linkedin.com/pulse/choosing-number-hidden-layers-neurons-neural-networks-sachdev/
(5) https://towardsdatascience.com/understanding-binary-cross-entropy-log-loss-a-visual-explanation-a3ac6025181a
(6) https://towardsdatascience.com/a-complete-step-by-step-tutorial-on-sentiment-analysis-in-keras-and-tensorflow-ea420cc8913f
(7) https://www.youtube.com/watch?v=mdKjMPmcWjY